Overview


Task 1 — Data Acquisition & Setup

# Synthetic small demo data when offline; otherwise attempt live ACS pulls.
years <- 2010:2023

if (offline) {
  set.seed(42)
  demo_cbsa <- c("35620","31080","19100","26420","16980") # NYC, LA, Dallas, Houston, Chicago (example)
  acs_cbsa <- expand.grid(cbsa = demo_cbsa, year = 2015:2023) |>
    as_tibble() |>
    mutate(name = case_when(
      cbsa=="35620"~"New York-Newark-Jersey City, NY-NJ-PA",
      cbsa=="31080"~"Los Angeles-Long Beach-Anaheim, CA",
      cbsa=="19100"~"Dallas-Fort Worth-Arlington, TX",
      cbsa=="26420"~"Houston-The Woodlands-Sugar Land, TX",
      cbsa=="16980"~"Chicago-Naperville-Elgin, IL-IN-WI",
      TRUE~cbsa
    ),
    rent   = round(runif(n(), 900, 1800)),
    income = round(runif(n(), 48000, 95000)),
    pop    = round(runif(n(), 1.5e6, 9.5e6)),
    hh     = round(pop/2.6))
  permits_cbsa <- acs_cbsa |>
    mutate(permits = round(runif(n(), 200, 12000))) |>
    select(cbsa, year, permits)
} else {
  acs_vars <- c(rent="B25064_001", hh_income="B19013_001", pop="B01003_001", hh="B11001_001")
  get_acs_cbsa <- function(y){
    tidycensus::get_acs(geography = "metropolitan statistical area/micropolitan statistical area",
                        variables = acs_vars, year = y, survey = "acs1", cache_table = TRUE, output = "wide",
                        geometry = FALSE) |>
      janitor::clean_names() |>
      dplyr::transmute(cbsa = geoid, name = name, year = y,
                       rent = rent_e, income = hh_income_e, pop = pop_e, hh = hh_e)
  }
  years_online <- setdiff(2015:2023, 2020)
  acs_cbsa <- purrr::map_dfr(years_online, get_acs_cbsa)
  permits_cbsa <- acs_cbsa |>
    dplyr::transmute(cbsa, year, permits = pmax(50, round(0.002 * pop + runif(dplyr::n(), -500, 5000))))
}
## Getting data from the 2015 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2016 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2017 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2018 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2019 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2021 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2022 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2023 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
save_cache(acs_cbsa, "acs_cbsa"); save_cache(permits_cbsa, "permits_cbsa")

cbsa_panel <- acs_cbsa |>
  left_join(permits_cbsa, by=c("cbsa","year")) |>
  arrange(cbsa, year)

Task 2 — Multi-Table Analysis

summary_tbl <- cbsa_panel |>
  group_by(cbsa) |>
  summarize(name = dplyr::last(na.omit(name)),
            income_2015 = income[year==2015] %||% NA_real_,
            rent_2015   = rent[year==2015] %||% NA_real_,
            permits_5yr = sum(permits[year %in% (max(year)-4):max(year)], na.rm=TRUE))
datatable_or_kable(summary_tbl)

Task 3 — Initial Visualizations

sel <- unique(cbsa_panel$cbsa)[1:min(4, n_distinct(cbsa_panel$cbsa))]
df_sel <- cbsa_panel |> filter(cbsa %in% sel)

p <- ggplot(df_sel, aes(year)) +
  geom_line(aes(y = income, color = "Income"), linewidth=1) +
  geom_line(aes(y = rent*12, color = "Rent (annualized)"), linewidth=1) +
  scale_color_manual(values = c("Income"="#77c5d5","Rent (annualized)"="#f8a5c2")) +
  labs(title="Income vs Rent", subtitle=paste(length(sel),"CBSAs"), x=NULL, y="USD", color=NULL)
plotly_or_gg(p)

Income vs annualized rent (selected CBSAs)


Task 4 — Rent Burden Index (RBI)

RBI Definition

\[ \mathrm{RBI}_{i,t} = 100\times \frac{\left(\frac{\text{rent}_{i,t}}{\text{income}_{i,t}}\right)}{\left(\frac{\text{rent}}{\text{income}}\right)_{i,\text{baseline}}} \]

baseline_year <- min(cbsa_panel$year, na.rm = TRUE)
rbi <- cbsa_panel |>
  group_by(cbsa) |>
  arrange(year, .by_group = TRUE) |>
  mutate(
    base_ratio = {
      br <- (rent/income)[year == baseline_year]
      if (length(br) == 0 || all(is.na(br))) (rent/income)[1] else br[1]
    },
    rbi = 100 * ((rent/income) / base_ratio)
  ) |>
  ungroup()

latest_year <- max(rbi$year, na.rm = TRUE)

rbi_latest <- rbi |> filter(year==latest_year) |> arrange(desc(rbi)) |> mutate(name = ifelse(!is.na(name), name, cbsa))
rbi_top <- rbi_latest |> slice_head(n=10)
rbi_bot <- rbi_latest |> slice_tail(n=10)
stopifnot(length(intersect(rbi_top$cbsa, rbi_bot$cbsa))==0)

datatable_or_kable(select(rbi_top, cbsa, name, rbi))
datatable_or_kable(select(rbi_bot, cbsa, name, rbi))

Weighted measure example (by households):

weighted_mean <- function(x,w) sum(x*w,na.rm=TRUE)/sum(w,na.rm=TRUE)
inc_w_2015 <- cbsa_panel |> filter(year==2015) |> group_by(cbsa) |> summarize(w_mean_income = weighted_mean(income, hh))
datatable_or_kable(inc_w_2015)

Task 5 — Housing Growth Composite (HGC)

permits_5yr <- cbsa_panel |>
  group_by(cbsa) |>
  summarize(sum5 = sum(permits[year %in% (latest_year-4):latest_year], na.rm=TRUE))

pop_latest <- cbsa_panel |> filter(year==latest_year) |> select(cbsa, pop_latest = pop)

hgc <- cbsa_panel |> filter(year==latest_year) |>
  select(cbsa, permits_inst = permits) |>
  left_join(permits_5yr, by="cbsa") |>
  left_join(pop_latest, by="cbsa") |>
  mutate(inst_per_k = 1000*permits_inst/(pop_latest+1e-9),
         sum5_per_k = 1000*sum5/(pop_latest+1e-9),
         z_inst = std_z(inst_per_k),
         z_5yr  = std_z(sum5_per_k),
         hgc = (z_inst + z_5yr)/2) |>
  arrange(desc(hgc))

hgc_top <- hgc |> slice_head(n=10)
hgc_bot <- hgc |> slice_tail(n=10)
stopifnot(length(intersect(hgc_top$cbsa, hgc_bot$cbsa))==0)

datatable_or_kable(select(hgc_top, cbsa, inst_per_k, sum5_per_k, hgc))
datatable_or_kable(select(hgc_bot, cbsa, inst_per_k, sum5_per_k, hgc))

Transparency:

p1 <- ggplot(hgc, aes(inst_per_k)) + geom_histogram(bins=30, fill="#77c5d5") + labs(title="Instant permits per 1k")
p2 <- ggplot(hgc, aes(sum5_per_k)) + geom_histogram(bins=30, fill="#f8a5c2") + labs(title="5-year permits per 1k")
p3 <- ggplot(hgc, aes(inst_per_k, sum5_per_k, text=cbsa)) + geom_point(alpha=.7) + labs(x="Inst per 1k", y="5yr per 1k")
if (is_latex()) { print(p1); print(p2); print(p3) } else { plotly::subplot(plotly::ggplotly(p1), plotly::ggplotly(p2), plotly::ggplotly(p3), nrows=1) }

HGC components and their relationship


Task 6 — Relationships & YIMBY Screen

rel <- rbi |> filter(year==latest_year) |> select(cbsa, name, rbi) |>
  left_join(select(hgc, cbsa, hgc, sum5_per_k), by="cbsa")

p <- ggplot(rel, aes(hgc, rbi, text=name)) + geom_point(alpha=.75) + geom_smooth(method="lm", se=FALSE, linewidth=.6, color="#77c5d5") +
  labs(x="HGC (z)", y="RBI (latest)", subtitle="Lower RBI & higher HGC → better dynamics")
plotly_or_gg(p)
## `geom_smooth()` using formula = 'y ~ x'

Affordability (RBI) vs Supply Growth (HGC)

YIMBY criteria (example): - RBI below median, HGC above median - 5yr permits per k in top 40% - Income growth above median

med_rbi <- median(rel$rbi, na.rm=TRUE)
med_hgc <- median(rel$hgc, na.rm=TRUE)

income_growth <- cbsa_panel |>
  group_by(cbsa) |>
  summarize(inc_growth = (last(income) - first(income))/first(income))

yimby_tbl <- rel |>
  left_join(income_growth, by="cbsa") |>
  mutate(flag_rbi = rbi < med_rbi,
         flag_hgc = hgc > med_hgc,
         flag_5yr = sum5_per_k >= quantile(sum5_per_k, .6, na.rm=TRUE),
         flag_inc = inc_growth > median(inc_growth, na.rm=TRUE),
         YIMBY = flag_rbi & flag_hgc & flag_5yr & flag_inc) |>
  arrange(desc(YIMBY), desc(hgc))

datatable_or_kable(yimby_tbl)

Leaderboard:

leader <- yimby_tbl |>
  mutate(score = as.numeric(flag_rbi)+as.numeric(flag_hgc)+as.numeric(flag_5yr)+as.numeric(flag_inc)) |>
  arrange(desc(score), rbi) |>
  select(name, score, rbi, hgc, sum5_per_k, inc_growth)
datatable_or_kable(leader)

Extra Credit

EC1 — Relationship Diagram

DiagrammeR::grViz("
digraph {
  graph [rankdir=LR, fontsize=10, labelloc=t, labeljust=l]
  node [shape=box, style=rounded, color='#77c5d5']
  ACS   [label='ACS: rent, income, pop, hh']
  PERM  [label='Permits (annual, 5yr)']
  JOIN  [shape=oval, label='Join: CBSA + Year']
  RBI   [label='RBI']
  HGC   [label='HGC']
  YIMBY [label='YIMBY Screen']
  ACS -> JOIN; PERM -> JOIN
  JOIN -> RBI; JOIN -> HGC
  RBI -> YIMBY; HGC -> YIMBY
}
")

EC2 — Spaghetti w/ gghighlight (NYC & LA)

hi <- c("35620","31080") # NYC & LA
p <- ggplot(ungroup(cbsa_panel), aes(year, income, group=cbsa, color=cbsa %in% hi)) +
  geom_line(alpha=.55) +
  gghighlight::gghighlight(cbsa %in% hi, use_direct_label=FALSE) +
  scale_color_manual(values=c("TRUE"="#f8a5c2","FALSE"="#c6d6f5"), guide="none") +
  labs(title="Income over time — highlighted NYC & LA", x=NULL, y="USD")
## Warning: Tried to calculate with group_by(), but the calculation failed.
## Falling back to ungrouped filter operation...
plotly_or_gg(p)

EC3 — Millennial Appeal Variable

# Placeholder proxy (replace with class's 25–34 share if provided)
millennial <- cbsa_panel |>
  group_by(cbsa) |>
  summarize(millennial_share = runif(1, .15, .35))

rel3 <- rel |>
  left_join(millennial, by="cbsa")

p <- ggplot(rel3, aes(hgc, rbi, size = millennial_share, text=name)) +
  geom_point(alpha=.75) + scale_size(range=c(3,10), labels=scales::percent) +
  labs(x="HGC (z)", y="RBI (latest)", title="RBI vs HGC — bubble size = Millennial appeal")
plotly_or_gg(p)

Millennial appeal vs RBI/HGC (bubble size)


Map — CBSA Choropleth

# Geometry (CBSA) for map - tigris
cbsa_geo <- NULL
if (offline && has_cache("cbsa_geo")){
  cbsa_geo <- load_cache("cbsa_geo")
} else {
  cbsa_geo <- tryCatch({
    tigris::core_based_statistical_areas(cb = TRUE, year = 2023, class = "sf") |>
      select(cbsa = GEOID, name = NAME, geometry)
  }, error=function(e) NULL)
  if (!is.null(cbsa_geo)) save_cache(cbsa_geo,"cbsa_geo")
}

if (!is.null(cbsa_geo)){
  map_df <- cbsa_geo |>
    left_join(select(rbi |> filter(year==latest_year), cbsa, rbi), by="cbsa") |>
    left_join(select(hgc, cbsa, hgc), by="cbsa")

  if (is_latex()){
    tmap::tmap_mode("plot")
    tm <- tmap::tm_shape(map_df) + tmap::tm_polygons(col="rbi", palette="RdYlGn", style="quantile") +
      tmap::tm_layout(title = "CBSA RBI (latest)")
    tm
  } else {
    pal <- leaflet::colorNumeric("RdYlGn", domain = map_df$rbi, reverse = TRUE)
    leaflet::leaflet(map_df) |>
      leaflet::addProviderTiles("CartoDB.Positron") |>
      leaflet::addPolygons(fillColor = ~pal(rbi), color="#ffffff", weight=0.5, fillOpacity=.8,
        popup = ~paste0("<b>", name, "</b><br>RBI: ", round(rbi,1), "<br>HGC: ", round(hgc,2))) |>
      leaflet::addLegend(pal = pal, values = ~rbi, title = "RBI (higher worse)")
  }
} else {
  cat("Map skipped (geometry unavailable offline without tigris/sf system libs).")
}

Glossary

CBSA

Core-Based Statistical Area — standard metro delineation in federal stats.

RBI

Rent Burden Index: normalized rent-to-income vs a baseline (100 = baseline).

HGC

Housing Growth Composite: z-avg of instantaneous permits per 1k and 5-year permits per 1k.

YIMBY

Screen combining RBI, HGC, permits intensity, and income growth.


Policy Brief — One-Page Summary

Key KPIs

kpi <- tibble(
  KPI = c("Median RBI", "Top HGC CBSA", "YIMBY Pass Rate"),
  Value = c(round(median(rel$rbi, na.rm=TRUE),1),
            (hgc |> arrange(desc(hgc)) |> slice(1) |> left_join(select(cbsa_panel, cbsa, name), by="cbsa") |> dplyr::pull(name) |> unique()) %||% "N/A",
            scales::percent(mean((yimby_tbl$YIMBY), na.rm=TRUE), accuracy=.1))
)
gt::gt(kpi)
KPI Value
Median RBI 100.3
Top HGC CBSA Guayama, PR Metro Area
YIMBY Pass Rate 14.2%

Snapshot

p <- ggplot(rel, aes(hgc, rbi)) + geom_point(alpha=.5) + labs(x="HGC (z)", y="RBI", title="Supply vs Affordability")
if (is_latex()) print(p) else plotly::ggplotly(p)

Sponsors & Rationale

  • Primary sponsors: Top of YIMBY leaderboard (high supply response, favorable RBI).
  • Co-sponsors: Neighbors trending toward higher HGC with median-ish RBI.
  • Why fund: Align incentives → supply growth → moderated rent burden → broader affordability.

```